
> ###############################################################################
> ## Data
> ###############################################################################
> 
> ## SOEP Data Set
> soep <- read.dta("dat/proc-data/soepv35_rents.dta") %>%
+   droplevels()

> ## Inspect missingness by year
> soep_missings <- soep %>%
+   split(soep$year) %>%
+   sapply(function(dat)
+     apply(dat, 2, mdesc, mean = TRUE))

> ## Drop empty variables
> soep <- soep %>%
+   dplyr::select(-tie_larea,-want_move,-pref_bcty,-pref_mcty,-pref_town,-pref_cntr,-wr_house)

> ## Drop irrelevant variables
> soep <- soep %>%
+   dplyr::select(
+     -born,-bl,-citizen,-pinc_labor,-log_pinc_labor,-pinc_gross_ttl,-hinc_gross_ttl,-temp,-involpart,-zlwork,-emplrel_r,-emplno_r,-class16_r,-class5_r,-class4,-class6_r,-myclass4,-hh_class,-task,-nace,-starts_with("wsa"),-wttr_job,-cold_rent,-warm_rent,-union
+   )

> soep <- soep %>%
+   dplyr::select(-hh_num_child,-hh_num_ecact,-unemp00,-contact_neighbors)

> soep <- soep %>%
+   dplyr::select(-stf_larng,-stf_larea)

> soep <- soep %>%
+   dplyr::select(
+     -starts_with("__0"),-moved_area_type,-movein_resmove,-mover_type,-mover_type2,-reason_for_moving
+   )

> ## Interpolate assets
> soep <- soep %>%
+   group_by(id) %>%
+   arrange(year) %>%
+   mutate_at(
+     .vars = vars(starts_with("asset") & !contains("type")),
+     .funs = ~ zoo::na.approx(
+       .,
+       maxgap = 4L,
+       rule = 2,
+       na.rm = FALSE
+     )
+   ) %>%
+   mutate_at(
+     .vars = vars(starts_with("asset") & contains("type")),
+     .funs = ~ zoo::na.locf(
+       .,
+       maxgap = 4L,
+       rule = 2,
+       na.rm = FALSE
+     )
+   ) %>%
+   ungroup()

> ## Factors to numeric
> soep <- soep %>%
+   mutate_at(.vars = vars(starts_with("stf_"), lr_self),
+             .funs = ~ as.numeric(.))

> ## Person-year ID, filter years before 2005
> soep <- soep %>%
+   mutate(pyid = paste(id, year, sep = ".")) %>%
+   filter(year >= 2005)

> ## Ownership etc
> soep <- soep %>%
+   mutate(renter_no_rent = as.numeric(owner == "renter (no rent)")) %>%
+   mutate(owner = as.numeric(owner == "owner")) %>%
+   droplevels()

> ## Recode missing category in wr_jbsec
> soep <- soep %>%
+   mutate(
+     wr_jbsec =
+       recode_factor(
+         wr_jbsec,
+         "[-6] Fragebogenversion mit geaenderter Filterfuehrung" = NA_character_
+       )
+   ) %>%
+   droplevels()

> ## Recode convoluted level names in lm_part
> soep <- soep %>%
+   mutate(lm_part =
+            recode_factor(
+              lm_part,
+              "Active, Atypical" = "Atypical",
+              " (Mostly) inactive" = "Inactive"
+            )) %>%
+   droplevels()

> ## Moving and missingness
> soep <- soep %>%
+   mutate(mover = as.numeric(mover) - 1L) %>%
+   mutate(mover = coalesce(mover, 0L))

> soep <- soep %>%
+   mutate_at(
+     .vars = vars(
+       starts_with("moved_"),
+       starts_with("reason_for_moving"),
+       social_housing
+     ),
+     .funs = ~ ifelse(mover == 0 & is.na(.),-2, .)
+   )

> ## Sample restriction: Only keep individuals who joined the panel pre-2015
> soep <- soep %>%
+   filter(erstbefr <= 2014)

> ###############################################################################
> ## Imputation of SOEP Data
> ###############################################################################
> 
> ## Define variables: ID, nominal ordinal, left-bounded (0)
> soep_ids <- c(
+   "hh_id",
+   "pyid",
+   "wave",
+   "lweight",
+   "xweight1",
+   "xweight2",
+   "xweight3",
+   "weight",
+   "psample",
+   "erstbefr",
+   "regbez",
+   "ggk",
+   "gtyp",
+   "bik",
+   "gkz",
+   "kkz",
+   "plz",
+   "change",
+   "gkzname",
+   "ror96",
+   "nuts3",
+   "nuts2",
+   "nuts1",
+   "kr_kkz",
+   "kr_kkz_rek",
+   "kr_kkz_rek90",
+   "kr_utmost",
+   "kr_utmnord",
+   "kr_uemprate",
+   "kr_foreigner",
+   "kr_emprate",
+   "kr_area",
+   "kr_hhinc",
+   "kr_gdp_pem",
+   "kr_gdp_pc",
+   "resmove",
+   "distance",
+   "chg_kkz",
+   "chg_zip",
+   "plz_lag",
+   "kkz_lag",
+   "kr_kkz_rek_lag",
+   "resmove_full",
+   "chg_kkz_full",
+   "chg_zip_full",
+   "pgexpue",
+   "class8_r",
+   "warm_rent_load",
+   "warm_rent_sqm",
+   "selfem_mainjob",
+   "task3",
+   "hinc",
+   "hinc_eq",
+   "imputed_rent",
+   "yrs_active",
+   "age5",
+   "prop_rentinc_hinc",
+   "pyid",
+   "unemp_frac",
+   "owner",
+   "renter_no_rent",
+   names(soep)[grepl("asset", names(soep))],
+   "mover",
+   "moved_finances",
+   "moved_size",
+   "moved_furnishings",
+   "moved_area",
+   "moved_environment",
+   "moved_connections",
+   "moved_neighborhood",
+   "moved_contract",
+   "hh_at_current_address",
+   "reason_for_moving_costs",
+   "reason_for_moving_comfort"
+ )

> soep_nom <- c(
+   "hh_comp",
+   "myclass4_r",
+   "partyid",
+   "hh_role",
+   "vote",
+   "lm_part",
+   "fem",
+   "migr3",
+   "east",
+   "social_housing"
+ )

> soep_ord <- c("edu5",
+               "wr_jbsec",
+               "wr_immig",
+               "wr_ecego",
+               "wr_ecnat",
+               "wr_crime",
+               "pol_int")

> soep_sqr <- c(
+   "log_hinc_eq",
+   "yrs_since_movein",
+   "cold_rent_sqm",
+   "home_size",
+   "imputed_rent_per_sqm",
+   "hh_mmb",
+   "age"
+ )

> soep_con <- c("stf_hinc",
+               "stf_dwell",
+               "stf_pinc",
+               "stf_life",
+               "stf_work",
+               "lr_self")

> soep_lgs <- c("hh_prop_ecact",
+               "prop_personal_hinc",
+               "hh_prop_child",
+               "cold_rent_load")

> soep_cs <- "id"

> soep_ts <- "year"

> ## Save indicators of wave missingness
> soep_niw <- soep[, grepl("niw_", names(soep))]

> names(soep_niw) <-
+   substr(names(soep_niw), 5, nchar(names(soep_niw)))

> soep_niw <- cbind(soep$year, soep_niw)

> names(soep_niw)[1] <- "year"

> vars_to_impute <- c(soep_nom,
+                     soep_ord,
+                     soep_sqr,
+                     soep_lgs,
+                     soep_con)

> soep_niw <- soep_niw %>%
+   dplyr::select(c("year",
+                   names(soep_niw)[names(soep_niw) %in% vars_to_impute])) %>%
+   group_by(year) %>%
+   summarize_all(.fun = unique)

> ## Select relevant variables
> vars <- c(soep_nom,
+           soep_ord,
+           soep_sqr,
+           soep_lgs,
+           soep_con,
+           soep_cs,
+           soep_ts,
+           soep_ids)

> soep <- soep %>%
+   dplyr::select(all_of(vars))

> ## Drop IDs (idvars options return error, we'll add them back later)
> soep_to_imp <- soep %>%
+   dplyr::select(-all_of(soep_ids))

> ## Select lead/lag vars
> ll_vars <- names(soep_to_imp)[!(
+   names(soep_to_imp) %in% c(
+     "fem",
+     "migr3",
+     "east",
+     "mover",
+     "yrs_since_movein",
+     "id",
+     "year"
+   )
+ )]

> ## Proportion bounds
> bounds <- matrix(cbind(which(names(soep_to_imp) %in% soep_lgs), 0, 1),
+                  nrow = sum(names(soep_to_imp) %in% soep_lgs),
+                  ncol = 3L)

> ## Sequential imputation (Amelia)
> soep_imp <- amelia(
+   soep_to_imp,
+   m = 5,
+   noms = soep_nom,
+   ords = soep_ord,
+   sqrts = soep_sqr,
+   bounds = bounds,
+   seed = 2021101,
+   ts = soep_ts,
+   cs = soep_cs,
+   autopri = FALSE,
+   p2s = 2,
+   parallel = "snow",
+   ncpus = 5
+ )

Using ' snow ' parallel backend with 5 cores.
amelia starting
beginning prep functions
Variables used:  edu5 wr_jbsec wr_immig wr_ecego wr_ecnat wr_crime pol_int log_hinc_eq yrs_since_movein cold_rent_sqm home_size imputed_rent_per_sqm hh_mmb age hh_prop_ecact prop_personal_hinc hh_prop_child cold_rent_load stf_hinc stf_dwell stf_pinc stf_life stf_work lr_self noms.hh_comp.2 noms.hh_comp.3 noms.hh_comp.4 noms.myclass4_r.2 noms.myclass4_r.3 noms.myclass4_r.4 noms.partyid.2 noms.partyid.3 noms.partyid.4 noms.partyid.5 noms.partyid.6 noms.partyid.7 noms.partyid.8 noms.partyid.9 noms.hh_role.2 noms.hh_role.3 noms.hh_role.4 noms.hh_role.5 noms.vote.2 noms.vote.3 noms.vote.4 noms.vote.5 noms.vote.6 noms.vote.7 noms.vote.8 noms.vote.9 noms.lm_part.2 noms.lm_part.3 noms.lm_part.4 noms.lm_part.5 noms.lm_part.6 noms.fem.2 noms.migr3.2 noms.east.2 noms.social_housing.2 noms.social_housing.3 

> ## ----Post - processing----
> ## Add back unimputed/ID variables
> asset_vars <- c(
+   "asset_re_hom_p",
+   "asset_re_hom_m",
+   "asset_re_hom_t",
+   "asset_re_oth_p",
+   "asset_re_oth_m",
+   "asset_re_oth_t",
+   "asset_re_ttl_p",
+   "asset_re_ttl_m",
+   "asset_re_ttl_t",
+   "asset_ov_ttl_t",
+   "asset_re_type"
+ )

> for (m in seq_along(soep_imp$imputations)) {
+   asset_vars_m <- sapply(asset_vars, paste0, m)
+   soep_imp$imputations[[m]] <- soep_imp$imputations[[m]] %>%
+     left_join(soep %>%
+                 dplyr::select(id,
+                               year,
+                               all_of(soep_ids[!(startsWith(soep_ids, "asset"))]),
+                               all_of(asset_vars_m)),
+               by = c("id", "year"))
+ }

> ## Retrieve imputed values from transformed variables
> for (m in seq_along(soep_imp$imputations)) {
+   soep_imp$imputations[[m]] <- soep_imp$imputations[[m]] %>%
+     mutate(
+       hinc_eq = exp(log_hinc_eq) - 1,
+       log_hinc = log((exp(log_hinc_eq) - 1) * sqrt(hh_mmb)),
+       imputed_rent = imputed_rent_per_sqm * home_size,
+       age5 = case_when(
+         age < 25 ~ "< 25",
+         age >= 25 & age <= 34 ~ "25-34",
+         age >= 35 & age <= 44 ~ "35-44",
+         age >= 45 & age <= 54 ~ "45-54",
+         age >= 55 ~ ">=55",
+         TRUE ~ NA_character_
+       )
+     ) %>%
+     mutate_at(.vars = vars(starts_with("stf_")),
+               .funs = ~ (. - 1)) %>%
+     mutate_at(.vars = vars(starts_with("wr_")),
+               .funs = ~ (-as.numeric(.)) + 2) %>%
+     mutate_at(.vars = vars(wr_jbsec, stf_work),
+               .funs = ~ ifelse(!(lm_part %in% c(
+                 "Active", "Atypical"
+               )), NA_integer_, .)) %>%
+     mutate_at(
+       .vars = vars(cold_rent_sqm, cold_rent_load),
+       .funs = ~ ifelse(owner == 1, NA_real_, .)
+     ) %>%
+     mutate_at(
+       .vars = vars(
+         starts_with("moved_"),
+         starts_with("reason_for_moving"),
+         social_housing
+       ),
+       .funs = ~ ifelse(. == -2, NA_integer_, .)
+     )
+ }

> ## Recode wave-wise missing items back to missing
> for (m in seq_along(soep_imp$imputations)) {
+   for (v in 2:ncol(soep_niw)) {
+     v_name <- names(soep_niw)[v]
+     to_drop <- soep_niw$year[soep_niw[[v_name]] == 1]
+     soep_imp$imputations[[m]][[v_name]][soep_imp$imputations[[m]][["year"]] %in% to_drop] <-
+       NA
+   }
+ }

> ## Save
> save(soep_imp, file = "dat/proc-data/soep_imp.RData")
